In [121]:
# Author: Stephen Situ
# In this project, we build several models for an NLP (Natural Language Processing) multiclass classification task using Multinomial Naive Bayes,
# a simple dense NN, an LSTM NN, a GRU NN, a Bidirectional LSTM NN, a Conv1D NN, and a TensorFlow Hub sentence encoder model. To preprocess the data,
# we create a text_vectorizer object and trainable embedding layers for our neural networks. Afterwards, we evaluate the models with
# classification metrics like accuracy, recall, precision, and F1 score. We found that the LSTM, Bidirectional, and GRU models were the best,
# and then we combined them into an ensemble model that outperformed each of them individually. We also investigated the speed/score tradeoff
# and found that the simple dense model performed best on that metric. Finally, we extracted the weights and vocab of the LSTM model's embedding layer
# and visualized the word embeddings with the TensorFlow Embedding Projector (https://projector.tensorflow.org/).

# The original Tweet data can be found here: https://www.kaggle.com/datasets/datatattle/covid-19-nlp-text-classification
In [ ]:
# Import Libraries
import numpy as np
import pandas as pd
import tensorflow as tf
In [30]:
# Read csv
df_1 = pd.read_csv("Corona_NLP_test.csv",encoding='iso-8859-1')
df_2 = pd.read_csv("Corona_NLP_train.csv",encoding='iso-8859-1')
df_3 = pd.concat([df_1, df_2])
In [31]:
# Drop duplicates
df_3 = df_3.drop_duplicates()
In [32]:
# Drop N/A's
df_3 = df_3.dropna(subset=['OriginalTweet', 'Sentiment'])
In [33]:
# Find unique values
unique_values = df_3['Sentiment'].unique()
unique_values
Out[33]:
array(['Extremely Negative', 'Positive', 'Extremely Positive', 'Negative',
       'Neutral'], dtype=object)
In [34]:
# Define function to replace labels with integers
def replace_sentiment(sentiment):
    if sentiment == 'Extremely Negative':
        return 0
    elif sentiment == 'Negative':
        return 1
    elif sentiment == 'Neutral':
        return 2
    elif sentiment == 'Positive':
        return 3
    elif sentiment == 'Extremely Positive':
        return 4
    else:
        return sentiment
In [35]:
# Use .apply method to replace
df_3['Sentiment'] = df_3['Sentiment'].apply(replace_sentiment)
In [36]:
# Find unique values
unique_values = df_3['Sentiment'].unique()
unique_values
Out[36]:
array([0, 3, 4, 1, 2], dtype=int64)
In [41]:
# Check data types
df_3.dtypes
Out[41]:
UserName          int64
ScreenName        int64
Location         object
TweetAt          object
OriginalTweet    object
Sentiment         int64
dtype: object
In [42]:
df_3
Out[42]:
UserName ScreenName Location TweetAt OriginalTweet Sentiment
0 1 44953 NYC 02-03-2020 TRENDING: New Yorkers encounter empty supermar... 0
1 2 44954 Seattle, WA 02-03-2020 When I couldn't find hand sanitizer at Fred Me... 3
2 3 44955 NaN 02-03-2020 Find out how you can protect yourself and love... 4
3 4 44956 Chicagoland 02-03-2020 #Panic buying hits #NewYork City as anxious sh... 1
4 5 44957 Melbourne, Victoria 03-03-2020 #toiletpaper #dunnypaper #coronavirus #coronav... 2
... ... ... ... ... ... ...
41152 44951 89903 Wellington City, New Zealand 14-04-2020 Airline pilots offering to stock supermarket s... 2
41153 44952 89904 NaN 14-04-2020 Response to complaint not provided citing COVI... 0
41154 44953 89905 NaN 14-04-2020 You know it’s getting tough when @KameronWild... 3
41155 44954 89906 NaN 14-04-2020 Is it wrong that the smell of hand sanitizer i... 2
41156 44955 89907 i love you so much || he/him 14-04-2020 @TartiiCat Well new/used Rift S are going for ... 1

44955 rows × 6 columns

In [43]:
# Train/test split with an 80/20 ratio
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(df_3, test_size=0.2)
train_data_y = train_data["Sentiment"]
train_data_x = train_data["OriginalTweet"]
test_data_y = test_data["Sentiment"]
test_data_x = test_data["OriginalTweet"]
In [50]:
test_data_y
Out[50]:
24448    1
2470     0
6796     1
22922    3
181      2
        ..
8426     0
2371     4
26589    2
26452    2
39898    4
Name: Sentiment, Length: 8991, dtype: int64
In [ ]:
# Default TextVectorization usage to create a text vectorizer
#import tensorflow as tf
#from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
# Note: in TensorFlow 2.6+, you no longer need "layers.experimental.preprocessing";
# you can use "tf.keras.layers.TextVectorization" instead, see https://github.com/tensorflow/tensorflow/releases/tag/v2.6.0 for more

# Use the default TextVectorization variables
#text_vectorizer = TextVectorization(max_tokens=None, # how many words in the vocabulary (all of the different words in your text)
#                                    standardize="lower_and_strip_punctuation", # how to process text
#                                    split="whitespace", # how to split tokens
#                                    ngrams=None, # create groups of n-words?
#                                    output_mode="int", # how to map tokens to numbers
#                                    output_sequence_length=None) # how long should the output sequence of tokens be?
#                                    # pad_to_max_tokens=True) # not valid if using max_tokens=None
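In [ ]:
# A minimal, self-contained sketch of what the default settings above do, using a toy
# corpus (the sentences below are made up for illustration, not taken from the Tweet data).
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

toy_corpus = ["Shelves were empty at the supermarket",
              "Hand sanitizer was sold out again"]
toy_vectorizer = TextVectorization()  # all defaults: lowercase, strip punctuation, split on whitespace
toy_vectorizer.adapt(toy_corpus)      # build the vocabulary from the toy corpus
toy_vectorizer(["the supermarket was empty"])  # each word maps to an integer index; unseen words map to [UNK]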
In [51]:
# Find the average number of tokens (words) in the test Tweets
round(sum([len(i.split()) for i in test_data_x])/len(test_data_x))
Out[51]:
31
In [53]:
# Set up text vectorization with custom variables
# For max_tokens (the number of words in the vocabulary), multiples of 10,000 (10,000, 20,000, 30,000)
# or the exact number of unique words in your text (e.g. 32,179) are common values.
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

max_vocab_length = 10000 # number of words in the vocabulary
max_length = 31 # max length our sequences will be (i.e. how many words from a Tweet does our model see?)
text_vectorizer = TextVectorization(max_tokens=max_vocab_length,
                                    output_mode="int",
                                    output_sequence_length=max_length)
In [54]:
# Use .adapt method on training data
text_vectorizer.adapt(train_data_x)
In [56]:
# Test the text vectorizer
sample_sentence = "I hate covid a lot and it sucks"
text_vectorizer([sample_sentence])
Out[56]:
<tf.Tensor: shape=(1, 31), dtype=int64, numpy=
array([[  15, 2198,   58,    7,  358,    4,   30, 2893,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0]],
      dtype=int64)>
In [62]:
# Read a random tweet from training data and vectorize it
import random
random_sentence = random.choice(train_data_x)
print(f"Original text:\n{random_sentence}\
      \n\nVectorized version:")
text_vectorizer([random_sentence])
Original text:
someone close to my family tested positive for COVID 19 all she did was go to the grocery store Stay your ass home bruh like stop being fucking stupid Who tf cares if you wanna be out in the streets      

Vectorized version:
Out[62]:
<tf.Tensor: shape=(1, 31), dtype=int64, numpy=
array([[ 354,  317,    3,   37,  273,  577,  436,   10,   58,   52,   33,
         263,  321,   74,   76,    3,    2,   23,   20,  115,   34, 1393,
          78,    1,   70,  142,  122,  785, 1230,   64, 5812]],
      dtype=int64)>
In [59]:
# Get the unique words in the vocabulary
words_in_vocab = text_vectorizer.get_vocabulary()
top_5_words = words_in_vocab[:5] # most common tokens (notice the [UNK] token for "unknown" words)
bottom_5_words = words_in_vocab[-5:] # least common tokens
print(f"Number of words in vocab: {len(words_in_vocab)}")
print(f"Top 5 most common words: {top_5_words}") 
print(f"Bottom 5 least common words: {bottom_5_words}")
Number of words in vocab: 10000
Top 5 most common words: ['', '[UNK]', 'the', 'to', 'and']
Bottom 5 least common words: ['heri', 'hereÂ\x94', 'herbs', 'herbal', 'hemantsorenjmm']
In [63]:
# Create an embedding layer for the neural network
tf.random.set_seed(42)
from tensorflow.keras import layers

embedding = layers.Embedding(input_dim=max_vocab_length, # set input shape
                             output_dim=128, # set size of embedding vector
                             embeddings_initializer="uniform", # default, initialize randomly
                             input_length=max_length, # how long is each input
                             name="embedding_1") 

embedding
Out[63]:
<keras.layers.core.embedding.Embedding at 0x26195c9ed00>
In [69]:
# Get a random sentence from training set
random_sentence = random.choice(train_data_x)
print(f"Original text:\n{random_sentence}\
      \n\nEmbedded version:")

# Embed the random sentence (turn it into numerical representation)
sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed,sample_embed[0][0]
Original text:
In January I tweeted that Trump Trade Wars and economic sanctions caused food shortages, panic and desperation which directly contributed to the creation of the #coronavirus and his ultimate control of Supply Chain. This was deliberate, systemic, resulting in deadly consequences      

Embedded version:
Out[69]:
(<tf.Tensor: shape=(1, 31, 128), dtype=float32, numpy=
 array([[[ 0.00257028, -0.04243337, -0.02343434, ..., -0.00133356,
           0.04663134,  0.00035417],
         [-0.01242752,  0.02142335, -0.02424029, ...,  0.04241223,
           0.0263625 , -0.04868319],
         [ 0.04266793,  0.03783042, -0.0123207 , ..., -0.02365079,
          -0.00954125, -0.01112708],
         ...,
         [-0.00710202,  0.03660338,  0.04749844, ..., -0.03742009,
          -0.04001515,  0.04663609],
         [ 0.03848192, -0.04844777,  0.0345685 , ..., -0.01402212,
          -0.04769395, -0.0404261 ],
         [-0.03521683, -0.00981399, -0.00284491, ..., -0.04792733,
          -0.01664252, -0.03293632]]], dtype=float32)>,
 <tf.Tensor: shape=(128,), dtype=float32, numpy=
 array([ 2.5702827e-03, -4.2433370e-02, -2.3434341e-02,  8.0344789e-03,
         1.0745153e-03, -1.7929029e-02,  2.8977264e-02, -1.3977997e-03,
        -1.5810572e-02, -3.0726958e-02,  3.0961167e-02, -1.0513291e-03,
         4.6770964e-02, -2.5705492e-02,  4.7481630e-02,  1.6134176e-02,
         2.7894724e-02, -9.3294494e-03,  2.3151565e-02, -3.6803614e-02,
        -3.5249867e-02, -2.4860298e-02,  1.2418628e-03,  3.7282575e-02,
        -2.2311080e-02, -9.5168836e-03, -2.3742771e-02,  2.5782909e-02,
         3.8015518e-02, -3.2022476e-02, -4.2527914e-03, -8.2745180e-03,
        -7.3838979e-05, -9.4988942e-03,  6.7463890e-03, -4.8547637e-02,
        -1.4423311e-02, -2.5712121e-02,  4.5580339e-02, -1.8875754e-02,
         1.9473795e-02, -2.5681913e-02, -2.6389455e-02, -3.4608819e-02,
         4.3781772e-03, -4.2988770e-03, -1.9616604e-02, -3.3366576e-02,
        -3.6651753e-02,  2.1555666e-02, -3.0836686e-03,  3.6140036e-02,
         6.3757673e-03,  7.7042356e-03, -4.4482496e-02,  1.4548246e-02,
        -1.8304668e-02,  7.4930303e-03,  1.5066180e-02,  3.6475848e-02,
         1.1626970e-02, -2.6960596e-03, -1.1266552e-02, -1.1290491e-02,
         1.8606279e-02, -4.7292411e-02, -1.5093494e-02, -1.6912926e-02,
        -3.4358669e-02, -4.4665921e-02,  1.9346658e-02, -1.4779925e-02,
        -4.2759862e-02,  4.9276356e-02,  8.7143108e-04, -3.3007562e-02,
        -4.9956825e-02,  3.6116470e-02, -4.0328968e-02, -2.1484721e-02,
        -3.8409606e-03,  1.7727111e-02,  2.5041923e-03, -1.8877871e-03,
         2.1625366e-02, -1.3571978e-02,  8.3489306e-03,  4.6845045e-02,
         4.7346506e-02, -3.7503481e-02,  2.5321994e-02, -2.0060575e-02,
        -1.3522793e-02,  4.5044795e-03,  2.3088697e-02, -2.6669383e-02,
        -3.6229409e-02, -4.2654883e-02, -2.8200543e-02, -7.8269616e-03,
         4.2770315e-02,  4.2281751e-02,  1.0402847e-02,  1.9751079e-03,
        -4.3409895e-02, -2.8616060e-02, -4.6483517e-02, -4.7296502e-02,
        -7.2064623e-03, -4.0106405e-02, -1.0915063e-02,  2.9840302e-02,
        -2.2052431e-02,  3.7582148e-02, -2.5108470e-02,  4.0385414e-02,
         4.2068195e-02, -5.6435950e-03,  2.9582690e-02,  1.3326693e-02,
         3.9552812e-02,  7.6060779e-03, -2.6535273e-02,  4.8560772e-02,
         1.7586496e-02, -1.3335571e-03,  4.6631340e-02,  3.5417080e-04],
       dtype=float32)>)
In [72]:
# Function to evaluate: accuracy, precision, recall, f1-score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  """
  Calculates model accuracy, precision, recall and f1 score of a classification model.

  Args:
  -----
  y_true = true labels in the form of a 1D array
  y_pred = predicted labels in the form of a 1D array

  Returns a dictionary of accuracy, precision, recall, f1-score.
  """
  # Calculate model accuracy
  model_accuracy = accuracy_score(y_true, y_pred) * 100
  # Calculate model precision, recall and f1 score using "weighted" average
  model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
  model_results = {"accuracy": model_accuracy,
                  "precision": model_precision,
                  "recall": model_recall,
                  "f1": model_f1}
  return model_results
In [70]:
# Use a Multinomial Naive Bayes model as a baseline
# TfidfVectorizer() is a feature extraction step that converts the text data into numerical features
# using the Term Frequency-Inverse Document Frequency (TF-IDF) weighting
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Create tokenization and modelling pipeline
model_0 = Pipeline([
                    ("tfidf", TfidfVectorizer()), # convert words to numbers using tfidf
                    ("clf", MultinomialNB()) # model the text
])

# Fit the pipeline to the training data
model_0.fit(train_data_x, train_data_y)
Out[70]:
Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])
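In [ ]:
# A small sketch of what the "tfidf" step of the pipeline produces, on a made-up toy corpus
# (the sentences and variable names below are illustrative only).
from sklearn.feature_extraction.text import TfidfVectorizer

toy_docs = ["panic buying at the supermarket",
            "hand sanitizer sold out",
            "supermarket shelves empty again"]
toy_tfidf = TfidfVectorizer()
toy_matrix = toy_tfidf.fit_transform(toy_docs)  # sparse matrix: one row per document, one column per word
print(toy_matrix.shape)                         # (3, number of unique words)
print(sorted(toy_tfidf.vocabulary_)[:5])        # first few words in the learned vocabulary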
In [87]:
# Check results
model_0_results = calculate_results(y_true=test_data_y, y_pred=model_0.predict(test_data_x))
model_0_results 
Out[87]:
{'accuracy': 35.50216883550217,
 'precision': 0.6135211640206998,
 'recall': 0.3550216883550217,
 'f1': 0.24722328062848883}
In [77]:
# Use a simple TensorFlow dense model
from tensorflow.keras import layers
inputs = layers.Input(shape=(1,), dtype="string") # inputs are 1-dimensional strings
x = text_vectorizer(inputs) # turn the input text into numbers
x = embedding(x) # create an embedding of the tokenized inputs
x = layers.GlobalAveragePooling1D()(x) # lower the dimensionality of the embedding (try running the model without this layer and see what happens)
outputs = layers.Dense(5, activation="softmax")(x) 
model_1 = tf.keras.Model(inputs, outputs, name="model_1_dense") 

model_1.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

history_1 = model_1.fit(train_data_x, # input sentences can be a list of strings because the text vectorization layer is built into the model
                              train_data_y,
                              epochs=5,
                              validation_data=(test_data_x, test_data_y))
Epoch 1/5
1124/1124 [==============================] - 15s 13ms/step - loss: 1.3669 - accuracy: 0.4093 - val_loss: 1.1985 - val_accuracy: 0.4977
Epoch 2/5
1124/1124 [==============================] - 14s 12ms/step - loss: 1.0428 - accuracy: 0.5981 - val_loss: 1.0484 - val_accuracy: 0.5781
Epoch 3/5
1124/1124 [==============================] - 14s 12ms/step - loss: 0.8619 - accuracy: 0.6927 - val_loss: 1.0119 - val_accuracy: 0.6086
Epoch 4/5
1124/1124 [==============================] - 14s 13ms/step - loss: 0.7496 - accuracy: 0.7428 - val_loss: 1.0160 - val_accuracy: 0.6165
Epoch 5/5
1124/1124 [==============================] - 15s 13ms/step - loss: 0.6721 - accuracy: 0.7755 - val_loss: 1.0519 - val_accuracy: 0.6151
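In [ ]:
# A quick shape check (a sketch reusing text_vectorizer and embedding from above; the sentence is
# made up): GlobalAveragePooling1D averages the 31 token embeddings into a single 128-dim vector
# per Tweet, which is the shape the final Dense layer expects.
sample_tokens = text_vectorizer(["the supermarket shelves were empty"])  # (1, 31)
sample_embedded = embedding(sample_tokens)                               # (1, 31, 128)
sample_pooled = layers.GlobalAveragePooling1D()(sample_embedded)         # (1, 128)
print(sample_embedded.shape, sample_pooled.shape)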
In [83]:
# Use an LSTM model
tf.random.set_seed(42)
from tensorflow.keras import layers
model_2_embedding = layers.Embedding(input_dim=max_vocab_length,
                                     output_dim=128,
                                     embeddings_initializer="uniform",
                                     input_length=max_length,
                                     name="embedding_2")

# Create LSTM model
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = model_2_embedding(x)
print(x.shape)
# x = layers.LSTM(64, return_sequences=True)(x) # return vector for each word in the Tweet (you can stack RNN cells as long as return_sequences=True)
x = layers.LSTM(64)(x) # return vector for whole sequence
print(x.shape)
# x = layers.Dense(64, activation="relu")(x) # optional dense layer on top of output of LSTM cell
outputs = layers.Dense(5, activation="softmax")(x)
model_2 = tf.keras.Model(inputs, outputs, name="model_2_LSTM")

model_2.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

history_2 = model_2.fit(train_data_x, # input sentences can be a list of strings because the text vectorization layer is built into the model
                              train_data_y,
                              epochs=5,
                              validation_data=(test_data_x, test_data_y))
(None, 31, 128)
(None, 64)
Epoch 1/5
1124/1124 [==============================] - 29s 24ms/step - loss: 1.1258 - accuracy: 0.5354 - val_loss: 0.8976 - val_accuracy: 0.6538
Epoch 2/5
1124/1124 [==============================] - 27s 24ms/step - loss: 0.7640 - accuracy: 0.7199 - val_loss: 0.8219 - val_accuracy: 0.6944
Epoch 3/5
1124/1124 [==============================] - 27s 24ms/step - loss: 0.6431 - accuracy: 0.7736 - val_loss: 0.8477 - val_accuracy: 0.6867
Epoch 4/5
1124/1124 [==============================] - 26s 23ms/step - loss: 0.5376 - accuracy: 0.8151 - val_loss: 0.9264 - val_accuracy: 0.6751
Epoch 5/5
1124/1124 [==============================] - 26s 23ms/step - loss: 0.4387 - accuracy: 0.8521 - val_loss: 1.0318 - val_accuracy: 0.6717
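In [ ]:
# A sketch (not trained here) of the stacking mentioned in the commented-out line above:
# the first LSTM must return the full sequence so the second LSTM still receives one
# vector per token.
stack_inputs = layers.Input(shape=(1,), dtype="string")
s = text_vectorizer(stack_inputs)
s = layers.Embedding(max_vocab_length, 128)(s)  # (None, 31, 128)
s = layers.LSTM(64, return_sequences=True)(s)   # (None, 31, 64) - one output per token
s = layers.LSTM(64)(s)                          # (None, 64)     - one output per Tweet
stack_outputs = layers.Dense(5, activation="softmax")(s)
stacked_lstm_sketch = tf.keras.Model(stack_inputs, stack_outputs, name="stacked_LSTM_sketch")
stacked_lstm_sketch.summary()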
In [85]:
# Use a GRU model
tf.random.set_seed(42)
from tensorflow.keras import layers
model_3_embedding = layers.Embedding(input_dim=max_vocab_length,
                                     output_dim=128,
                                     embeddings_initializer="uniform",
                                     input_length=max_length,
                                     name="embedding_3")

# Build an RNN using the GRU cell
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = model_3_embedding(x)
# x = layers.GRU(64, return_sequences=True)(x) # stacking recurrent cells requires return_sequences=True
x = layers.GRU(64)(x) 
# x = layers.Dense(64, activation="relu")(x) # optional dense layer after GRU cell
outputs = layers.Dense(5, activation="softmax")(x)
model_3 = tf.keras.Model(inputs, outputs, name="model_3_GRU")


model_3.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

# Fit model
history_3 = model_3.fit(train_data_x, # input sentences can be a list of strings because the text vectorization layer is built into the model
                              train_data_y,
                              epochs=5,
                              validation_data=(test_data_x, test_data_y))
Epoch 1/5
1124/1124 [==============================] - 30s 25ms/step - loss: 1.1551 - accuracy: 0.5164 - val_loss: 0.8801 - val_accuracy: 0.6639
Epoch 2/5
1124/1124 [==============================] - 27s 24ms/step - loss: 0.7519 - accuracy: 0.7256 - val_loss: 0.8142 - val_accuracy: 0.6995
Epoch 3/5
1124/1124 [==============================] - 27s 24ms/step - loss: 0.6182 - accuracy: 0.7800 - val_loss: 0.8299 - val_accuracy: 0.6922
Epoch 4/5
1124/1124 [==============================] - 28s 25ms/step - loss: 0.4928 - accuracy: 0.8313 - val_loss: 0.9421 - val_accuracy: 0.6687
Epoch 5/5
1124/1124 [==============================] - 29s 26ms/step - loss: 0.3799 - accuracy: 0.8734 - val_loss: 1.0371 - val_accuracy: 0.6637
In [88]:
# Use a bidirectional RNN model
tf.random.set_seed(42)
from tensorflow.keras import layers
model_4_embedding = layers.Embedding(input_dim=max_vocab_length,
                                     output_dim=128,
                                     embeddings_initializer="uniform",
                                     input_length=max_length,
                                     name="embedding_4")

# Build a Bidirectional RNN in TensorFlow
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = model_4_embedding(x)
# x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x) # stacking RNN layers requires return_sequences=True
x = layers.Bidirectional(layers.LSTM(64))(x) # bidirectional goes both ways so has double the parameters of a regular LSTM layer
outputs = layers.Dense(5, activation="softmax")(x)
model_4 = tf.keras.Model(inputs, outputs, name="model_4_Bidirectional")
# Compile
model_4.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

# Fit the model (takes longer because of the bidirectional layers)
history_4 = model_4.fit(train_data_x, # input sentences can be a list of strings because the text vectorization layer is built into the model
                              train_data_y,
                              epochs=5,
                              validation_data=(test_data_x, test_data_y))
Epoch 1/5
1124/1124 [==============================] - 42s 34ms/step - loss: 1.0945 - accuracy: 0.5523 - val_loss: 0.8854 - val_accuracy: 0.6619
Epoch 2/5
1124/1124 [==============================] - 37s 33ms/step - loss: 0.7448 - accuracy: 0.7264 - val_loss: 0.8200 - val_accuracy: 0.6936
Epoch 3/5
1124/1124 [==============================] - 37s 33ms/step - loss: 0.6063 - accuracy: 0.7808 - val_loss: 0.8650 - val_accuracy: 0.6862
Epoch 4/5
1124/1124 [==============================] - 38s 33ms/step - loss: 0.4742 - accuracy: 0.8311 - val_loss: 0.9551 - val_accuracy: 0.6679
Epoch 5/5
1124/1124 [==============================] - 38s 34ms/step - loss: 0.3461 - accuracy: 0.8809 - val_loss: 1.1107 - val_accuracy: 0.6707
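In [ ]:
# A quick check of the "double the parameters" comment above (a sketch on dummy inputs):
# an LSTM(64) over 128-dim inputs has 4 * ((128 + 64) * 64 + 64) = 49,408 parameters,
# so the Bidirectional wrapper (one LSTM forward, one backward) has 2 * 49,408 = 98,816.
dummy_embeddings = tf.zeros((1, 31, 128))
lstm_only = layers.LSTM(64)
lstm_only(dummy_embeddings)                     # call once so the layer builds its weights
bi_lstm = layers.Bidirectional(layers.LSTM(64))
bi_lstm(dummy_embeddings)                       # call once so the layer builds its weights
print(lstm_only.count_params(), bi_lstm.count_params())  # 49408 98816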
In [90]:
# Use a 1D convolutional (Conv1D) model
tf.random.set_seed(42)
from tensorflow.keras import layers
model_5_embedding = layers.Embedding(input_dim=max_vocab_length,
                                     output_dim=128,
                                     embeddings_initializer="uniform",
                                     input_length=max_length,
                                     name="embedding_5")

# Create a 1-dimensional convolutional layer to model sequences
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = model_5_embedding(x)
x = layers.Conv1D(filters=32, kernel_size=5, activation="relu")(x)
x = layers.GlobalMaxPool1D()(x)
# x = layers.Dense(64, activation="relu")(x) # optional dense layer
outputs = layers.Dense(5, activation="softmax")(x)
model_5 = tf.keras.Model(inputs, outputs, name="model_5_Conv1D")

# Compile Conv1D model
model_5.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])
history_5 = model_5.fit(train_data_x, # input sentences can be a list of strings because the text vectorization layer is built into the model
                              train_data_y,
                              epochs=5,
                              validation_data=(test_data_x, test_data_y))
Epoch 1/5
1124/1124 [==============================] - 16s 14ms/step - loss: 1.1869 - accuracy: 0.5063 - val_loss: 1.0052 - val_accuracy: 0.6145
Epoch 2/5
1124/1124 [==============================] - 15s 13ms/step - loss: 0.8448 - accuracy: 0.6870 - val_loss: 0.9755 - val_accuracy: 0.6220
Epoch 3/5
1124/1124 [==============================] - 15s 14ms/step - loss: 0.6060 - accuracy: 0.7933 - val_loss: 1.0282 - val_accuracy: 0.6161
Epoch 4/5
1124/1124 [==============================] - 15s 14ms/step - loss: 0.3631 - accuracy: 0.8916 - val_loss: 1.1664 - val_accuracy: 0.6012
Epoch 5/5
1124/1124 [==============================] - 16s 15ms/step - loss: 0.1853 - accuracy: 0.9544 - val_loss: 1.3613 - val_accuracy: 0.5854
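In [ ]:
# Shape check for the Conv1D + GlobalMaxPool1D combination (a sketch reusing the layers above;
# the sentence is made up): with 32 filters, kernel size 5 and the default "valid" padding,
# the 31-token sequence becomes 31 - 5 + 1 = 27 windows, and the max-pool keeps the
# strongest response per filter.
conv_inputs = embedding(text_vectorizer(["panic buying at the supermarket"]))             # (1, 31, 128)
conv_features = layers.Conv1D(filters=32, kernel_size=5, activation="relu")(conv_inputs)  # (1, 27, 32)
conv_pooled = layers.GlobalMaxPool1D()(conv_features)                                     # (1, 32)
print(conv_inputs.shape, conv_features.shape, conv_pooled.shape)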
In [92]:
# Use a transfer learning model with a pretrained sentence encoder (Universal Sentence Encoder from TensorFlow Hub)
import tensorflow_hub as hub
sentence_encoder_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
                                        input_shape=[], # shape of inputs coming to our model 
                                        dtype=tf.string, # data type of inputs coming to the USE layer
                                        trainable=False, # keep the pretrained weights (we'll create a feature extractor)
                                        name="USE") 

# Create model using the Sequential API
model_6 = tf.keras.Sequential([
  sentence_encoder_layer, # take in sentences and then encode them into an embedding
  layers.Dense(64, activation="relu"),
  layers.Dense(5, activation="softmax")
], name="model_6_USE")

# Compile model
model_6.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

# Train a classifier on top of pretrained embeddings
history_6 = model_6.fit(train_data_x, # input sentences can be raw strings because the USE layer handles its own preprocessing
                              train_data_y,
                              epochs=5,
                              validation_data=(test_data_x, test_data_y))
Epoch 1/5
1124/1124 [==============================] - 12s 9ms/step - loss: 1.2914 - accuracy: 0.4389 - val_loss: 1.2366 - val_accuracy: 0.4670
Epoch 2/5
1124/1124 [==============================] - 9s 8ms/step - loss: 1.2098 - accuracy: 0.4787 - val_loss: 1.2213 - val_accuracy: 0.4750
Epoch 3/5
1124/1124 [==============================] - 9s 8ms/step - loss: 1.1948 - accuracy: 0.4882 - val_loss: 1.2169 - val_accuracy: 0.4791
Epoch 4/5
1124/1124 [==============================] - 9s 8ms/step - loss: 1.1810 - accuracy: 0.4942 - val_loss: 1.2119 - val_accuracy: 0.4750
Epoch 5/5
1124/1124 [==============================] - 9s 8ms/step - loss: 1.1674 - accuracy: 0.5023 - val_loss: 1.2093 - val_accuracy: 0.4843
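In [ ]:
# A quick check of what the USE layer produces (reuses sentence_encoder_layer loaded above;
# the sentences are made up): each input string is mapped to a single fixed-length
# 512-dimensional embedding, so this model needs no text_vectorizer or embedding layer.
use_embeddings = sentence_encoder_layer(tf.constant(["Shelves were empty at the supermarket",
                                                     "Hand sanitizer sold out again"]))
print(use_embeddings.shape)  # (2, 512)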
In [118]:
model_0_results = calculate_results(y_true=test_data_y, y_pred=model_0.predict(test_data_x))
model_1_results = calculate_results(y_true=test_data_y, y_pred=model_1.predict(test_data_x).argmax(axis=1))
model_2_results = calculate_results(y_true=test_data_y, y_pred=model_2.predict(test_data_x).argmax(axis=1))
model_3_results = calculate_results(y_true=test_data_y, y_pred=model_3.predict(test_data_x).argmax(axis=1))
model_4_results = calculate_results(y_true=test_data_y, y_pred=model_4.predict(test_data_x).argmax(axis=1))
model_5_results = calculate_results(y_true=test_data_y, y_pred=model_5.predict(test_data_x).argmax(axis=1))
model_6_results = calculate_results(y_true=test_data_y, y_pred=model_6.predict(test_data_x).argmax(axis=1))
ensemble_results = calculate_results(y_true=test_data_y,  y_pred=((model_2.predict(test_data_x) + model_3.predict(test_data_x) + model_4.predict(test_data_x))/3).argmax(axis=1))
281/281 [==============================] - 0s 1ms/step
281/281 [==============================] - 2s 6ms/step
281/281 [==============================] - 2s 7ms/step
281/281 [==============================] - 2s 8ms/step
281/281 [==============================] - 1s 2ms/step
281/281 [==============================] - 2s 6ms/step
281/281 [==============================] - 1s 5ms/step
281/281 [==============================] - 1s 5ms/step
281/281 [==============================] - 2s 7ms/step
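In [ ]:
# A small hypothetical helper that generalizes the soft-voting ensemble used above:
# average the softmax probabilities of several Keras models, then take the argmax.
def soft_vote(models, samples):
  """Averages predicted class probabilities across models and returns class labels."""
  avg_probs = np.mean([m.predict(samples) for m in models], axis=0)
  return avg_probs.argmax(axis=1)

# For example, the same ensemble as above:
# ensemble_preds = soft_vote([model_2, model_3, model_4], test_data_x)
# calculate_results(y_true=test_data_y, y_pred=ensemble_preds)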
In [119]:
# Combine model results into a DataFrame
all_model_results = pd.DataFrame({"naive bayes": model_0_results,
                                  "simple_dense": model_1_results,
                                  "lstm": model_2_results,
                                  "gru": model_3_results,
                                  "bidirectional": model_4_results,
                                  "conv1d": model_5_results,
                                  "tf_hub_sentence_encoder": model_6_results,
                                   "ensemble_results":ensemble_results})
all_model_results = all_model_results.transpose()
# Reduce the accuracy to the same scale as the other metrics
all_model_results["accuracy"] = all_model_results["accuracy"]/100
all_model_results
Out[119]:
accuracy precision recall f1
naive bayes 0.355022 0.613521 0.355022 0.247223
simple_dense 0.615060 0.619968 0.615060 0.614946
lstm 0.671672 0.672680 0.671672 0.671852
gru 0.663664 0.666331 0.663664 0.664719
bidirectional 0.670671 0.674650 0.670671 0.671100
conv1d 0.585363 0.584906 0.585363 0.584887
tf_hub_sentence_encoder 0.484262 0.488389 0.484262 0.484484
ensemble_results 0.688466 0.690916 0.688466 0.689124
In [120]:
# Plot and compare all of the model results
all_model_results.plot(kind="bar", figsize=(10, 7)).legend(bbox_to_anchor=(1.0, 1.0));
In [122]:
# Sort model results by f1-score
all_model_results.sort_values("f1", ascending=False)["f1"].plot(kind="bar", figsize=(10, 7));
In [123]:
# Get the weights of the embedding layer from the LSTM model
model_2.summary()
Model: "model_2_LSTM"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 input_6 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 31)               0         
 ectorization)                                                   
                                                                 
 embedding_2 (Embedding)     (None, 31, 128)           1280000   
                                                                 
 lstm_1 (LSTM)               (None, 64)                49408     
                                                                 
 dense_5 (Dense)             (None, 5)                 325       
                                                                 
=================================================================
Total params: 1,329,733
Trainable params: 1,329,733
Non-trainable params: 0
_________________________________________________________________
In [128]:
# Save the weights and vocab of the LSTM model's embedding layer for the TensorFlow Embedding Projector
import io
weights = model_2.get_layer('embedding_2').get_weights()[0]
vocab = text_vectorizer.get_vocabulary()
out_v = io.open('vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('metadata.tsv', 'w', encoding='utf-8')
for index, word in enumerate(vocab):
  if index == 0:
    continue  # skip 0, it's padding.
  vec = weights[index]
  out_v.write('\t'.join([str(x) for x in vec]) + "\n")
  out_m.write(word + "\n")
out_v.close()
out_m.close()
In [131]:
# Calculate the time of predictions
import time
def pred_timer(model, samples):
  """
  Times how long a model takes to make predictions on samples.
  
  Args:
  ----
  model = a trained model
  samples = a list of samples

  Returns:
  ----
  total_time = total elapsed time for model to make predictions on samples
  time_per_pred = time in seconds per single sample
  """
  start_time = time.perf_counter() # get start time
  model.predict(samples) # make predictions
  end_time = time.perf_counter() # get finish time
  total_time = end_time-start_time # calculate how long predictions took to make
  time_per_pred = total_time/len(samples) # find prediction time per sample
  return total_time, time_per_pred
In [134]:
# Calculate prediction times for all models
model_0_total_pred_time, model_0_time_per_pred = pred_timer(model_0, test_data_x)
model_1_total_pred_time, model_1_time_per_pred = pred_timer(model_1, test_data_x)
model_2_total_pred_time, model_2_time_per_pred = pred_timer(model_2, test_data_x)
model_3_total_pred_time, model_3_time_per_pred = pred_timer(model_3, test_data_x)
model_4_total_pred_time, model_4_time_per_pred = pred_timer(model_4, test_data_x)
model_5_total_pred_time, model_5_time_per_pred = pred_timer(model_5, test_data_x)
model_6_total_pred_time, model_6_time_per_pred = pred_timer(model_6, test_data_x)
281/281 [==============================] - 0s 2ms/step
281/281 [==============================] - 1s 5ms/step
281/281 [==============================] - 2s 6ms/step
281/281 [==============================] - 2s 8ms/step
281/281 [==============================] - 1s 2ms/step
281/281 [==============================] - 2s 6ms/step
In [135]:
# Make a scatter plot of F1-score versus time per prediction
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 7))
plt.scatter(model_0_time_per_pred, model_0_results["f1"], label="naive bayes")
plt.scatter(model_1_time_per_pred, model_1_results["f1"], label="simple_dense")
plt.scatter(model_2_time_per_pred, model_2_results["f1"], label="lstm")
plt.scatter(model_3_time_per_pred, model_3_results["f1"], label="gru")
plt.scatter(model_4_time_per_pred, model_4_results["f1"], label="bidirectional")
plt.scatter(model_5_time_per_pred, model_5_results["f1"], label="conv1d")
plt.scatter(model_6_time_per_pred, model_6_results["f1"], label="tf_hub_sentence_encoder")
plt.legend()
plt.title("F1-score versus time per prediction")
plt.xlabel("Time per prediction")
plt.ylabel("F1-Score");